# download_vjcs_issue_live.py
# VJCS (Vietnam Journal of Computer Science) Downloader
# -------------------------------------------------
# Automates downloading PDFs from VJCS (World Scientific) issue pages
# - Handles HTTP 403 errors using session headers and referer spoofing
# - Parses article titles from issue pages and follows each to its "epdf" link
# - Transforms view URLs to direct PDF downloads with ?download=true parameter
# - Creates dynamic folder names like VJCS_Vol07_Issue01_2020 from issue metadata
# - Saves each PDF with sanitized filenames for cross-platform compatibility
# - Logs all downloads to CSV for tracking
# - Reusable for other World Scientific journals with similar structures
# -------------------------------------------------
# - Paste an issue URL, e.g.: https://www.worldscientific.com/toc/vjcs/07/01
# - Parses article titles from the issue page (h5.issue-item__title a[href])
# - Visits epdf and resolves the real PDF link: /doi/pdf/{DOI}?download=true
# - Saves PDFs to: VJCS_Vol{vol}_Issue{iss}_{yr}
# - Logs results to CSV
#
# Anti-403 strategy:
#   • Prefer cloudscraper (Cloudflare/Akamai bypass). Fallback to requests.Session.
#   • Warm up cookies via / and /loi/vjcs, keep strong headers + Referer chain.
#   • If still blocked, allow local HTML file path as a fallback to parse DOIs.


import csv
import importlib.util
import re
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Optional dependency: when cloudscraper imports cleanly, sessions come from
# it; on any import failure the script falls back to a plain requests.Session.
try:
    import cloudscraper  # type: ignore
except Exception:
    USING_CLOUDSCRAPER = False

    def new_session():
        """Return a plain ``requests.Session`` (cloudscraper could not be imported)."""
        return requests.Session()
else:
    USING_CLOUDSCRAPER = True

    def new_session():
        """Return a cloudscraper session built with a desktop Chrome/Windows browser profile."""
        browser_profile = {'browser': 'chrome', 'platform': 'windows', 'mobile': False}
        return cloudscraper.create_scraper(browser=browser_profile)

BASE = "https://www.worldscientific.com"
TIMEOUT = 60  # passed as the per-request ``timeout`` to every session.get call
PAUSE = 0.6  # polite delay


def _brotli_available() -> bool:
    """Return True when a Brotli decoder package (brotli / brotlicffi) can be located."""
    for module_name in ("brotli", "brotlicffi"):
        try:
            if importlib.util.find_spec(module_name) is not None:
                return True
        except (ImportError, ValueError):
            # find_spec can raise for half-initialised modules; treat as "not available".
            continue
    return False


# Only advertise "br" when the HTTP stack can actually decode it: requests/urllib3
# decode Brotli bodies only if a Brotli package is installed, and an undecoded
# body would reach the HTML parser as garbage.
_ACCEPT_ENCODING = "gzip, deflate, br" if _brotli_available() else "gzip, deflate"

# Default header profile (desktop Chrome on Windows) applied to the session.
PRIMARY_HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/124.0.0.0 Safari/537.36"),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": _ACCEPT_ENCODING,
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1",
    # Sec-Fetch headers help some bot checks:
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Dest": "document",
    "Connection": "keep-alive",
}

# Alternate profile (desktop Safari on macOS): identical to PRIMARY_HEADERS except
# for the User-Agent, so it is derived rather than duplicated key by key.
ALT_HEADERS = {
    **PRIMARY_HEADERS,
    "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                   "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15"),
}

def sanitize_filename(name: str) -> str:
    """Turn an article title into a string usable as a file name stem.

    Steps, in order:
      1. Drop characters that are illegal in file names on common platforms
         (``\\ / * ? : " < > |``) plus non-whitespace control characters
         such as NUL, which cannot appear in a path at all.
      2. Collapse every whitespace run (including newlines/tabs) to one space.
      3. Collapse runs of dots and trim leading/trailing dots and spaces.
      4. Truncate to 180 characters, then trim again so the cut cannot leave
         a trailing space or dot behind.

    Returns an empty string when nothing usable remains.
    """
    # Whitespace control characters (\t, \n, ...) are deliberately NOT removed
    # here so that step 2 can turn them into a single separating space.
    name = re.sub(r'[\\/*?:"<>|\x00-\x08\x0e-\x1b\x7f]', "", name)
    name = re.sub(r"\s+", " ", name).strip()
    name = re.sub(r"\.+", ".", name).strip(". ")
    # Re-strip after truncating: the 180th character may be a space or a dot.
    return name[:180].rstrip(". ")

def get_with_anti403(s: requests.Session, url: str, referer: str | None = None, alt_once: bool = True) -> requests.Response:
    """GET *url* through session *s*, retrying once with ALT_HEADERS on HTTP 403.

    Args:
        s: Session used for the request(s).
        url: URL to fetch.
        referer: Optional value sent as the ``Referer`` header of this request.
        alt_once: When true (and cloudscraper is not in use), a 403 response
            triggers exactly one retry with the session headers temporarily
            replaced by ``ALT_HEADERS``.

    Returns:
        The response of the last request made.

    Raises:
        requests.HTTPError: If the final response has an error status
            (propagated from ``raise_for_status``); callers use this to
            decide on the local-HTML fallback.
    """
    headers = {"Referer": referer} if referer else {}
    r = s.get(url, headers=headers, timeout=TIMEOUT)

    if r.status_code == 403 and alt_once and not USING_CLOUDSCRAPER:
        # Switch header profile once for the requests.Session fallback.
        saved_headers = dict(s.headers)
        s.headers.clear()
        s.headers.update(ALT_HEADERS)
        try:
            r = s.get(url, headers=headers, timeout=TIMEOUT)
        finally:
            # Always restore the original profile, even if the retry raised
            # (timeout, connection error), so later requests are unaffected.
            s.headers.clear()
            s.headers.update(saved_headers)

    r.raise_for_status()
    return r

def soup_from_response(resp: requests.Response) -> BeautifulSoup:
    """Parse the decoded text of *resp* using the ``html.parser`` backend."""
    markup = resp.text
    return BeautifulSoup(markup, "html.parser")

def soup_for_url(s: requests.Session, url: str, referer: str | None = None) -> BeautifulSoup:
    """Fetch *url* via ``get_with_anti403`` and return the parsed page.

    Errors raised by ``get_with_anti403`` propagate unchanged.
    """
    return soup_from_response(get_with_anti403(s, url, referer=referer))

def ensure_pdf(resp: requests.Response) -> bool:
    """Report whether *resp* looks like a PDF.

    True when the Content-Type header mentions "pdf" (case-insensitive) or,
    failing that, when the body starts with the ``%PDF-`` magic bytes.
    """
    declared_type = resp.headers.get("Content-Type") or ""
    if "pdf" in declared_type.lower():
        return True
    return resp.content[:5] == b"%PDF-"

def parse_folder(issue_soup: BeautifulSoup) -> tuple[str, str, str]:
    """
    Extract Volume, Issue, Year from:
      <h3 class="sep citation section__header">Volume 07, Issue 01 (February 2020)</h3>

    Every ``h3`` whose class matches ``section__header`` is tried in document
    order and the first one carrying the full "Volume N, Issue M ... YYYY"
    pattern wins. If none does, missing pieces are filled from the page
    ``<title>`` (volume, issue, year) and finally from the first standalone
    19xx/20xx year anywhere in the page text.

    Returns:
        ``(vol, iss, yr)`` as strings. Numeric volume/issue are zero-padded to
        two digits; unknown values become ``"XX"``, ``"YY"`` and ``"Year"``.
    """
    def norm(t: str) -> str:
        # Collapse whitespace and treat non-breaking spaces as ordinary spaces.
        return re.sub(r"\s+", " ", (t or "").replace("\xa0", " ")).strip()

    header_re = re.compile(
        r"Volume\s+(\d+)\s*,\s*Issue\s+(\d+).*?\b((?:19|20)\d{2})\b", re.I
    )
    # Word boundaries keep the year from matching inside longer digit runs
    # (e.g. the "1968" hidden in an identifier like "S2196888820500013").
    year_re = re.compile(r"\b(?:19|20)\d{2}\b")

    vol = iss = yr = ""
    for h3 in issue_soup.find_all("h3", class_=re.compile(r"\bsection__header\b")):
        m = header_re.search(norm(h3.get_text()))
        if m:
            vol, iss, yr = m.group(1), m.group(2), m.group(3)
            break

    if not (vol and iss and yr):
        title = issue_soup.title.get_text(" ", strip=True) if issue_soup.title else ""
        m_vol = re.search(r"Vol(?:ume)?\s*(\d+)", title, re.I)
        m_iss = re.search(r"(?:Issue|No\.?)\s*(\d+)", title, re.I)
        # Prefer a year stated in the title; only then scan the whole page.
        m_yr = year_re.search(title) or year_re.search(issue_soup.get_text(" ", strip=True))
        vol = vol or (m_vol.group(1) if m_vol else "")
        iss = iss or (m_iss.group(1) if m_iss else "")
        yr = yr or (m_yr.group(0) if m_yr else "")

    vol = vol.zfill(2) if vol.isdigit() else (vol or "XX")
    iss = iss.zfill(2) if iss.isdigit() else (iss or "YY")
    yr = yr or "Year"
    return vol, iss, yr

def collect_issue_articles(issue_soup: BeautifulSoup) -> list[dict]:
    """Collect the articles listed on an issue page.

    For every ``h5.issue-item__title a[href]`` link:
      - ``title`` is the link text.
      - ``doi_path`` is the link's URL path, normalised to ``/doi/<DOI>``
        when a DOI (``10.NNNN/...``) is recognisable in it.

    Compared with a plain ``href.startswith("/doi/")`` check this also accepts
    absolute and protocol-relative URLs (as found in browser-saved HTML, where
    links are commonly rewritten to absolute form), drops query strings and
    fragments, strips a view prefix such as ``/doi/abs/`` or ``/doi/full/``,
    and skips repeated links to the same article.

    Returns:
        A list of ``{"title": str, "doi_path": str}`` dicts in page order.
    """
    items: list[dict] = []
    seen: set[str] = set()
    for a in issue_soup.select("h5.issue-item__title a[href]"):
        href = (a.get("href") or "").strip()
        try:
            path = urlparse(href).path
        except ValueError:
            # Malformed URL (e.g. broken IPv6 literal): not an article link.
            continue
        if not path.startswith("/doi/"):
            continue

        m = re.match(r"/doi/(?:[a-z]+/)?(10\.\d{4,9}/.+)$", path, re.I)
        # Fall back to the raw path when no DOI pattern is recognisable.
        doi_path = f"/doi/{m.group(1)}" if m else path
        if doi_path in seen:
            continue
        seen.add(doi_path)

        items.append({"title": a.get_text(" ", strip=True), "doi_path": doi_path})
    return items

def epdf_to_pdf_url(epdf_html: str, doi: str) -> str:
    """Resolve the direct-download PDF URL for *doi*.

    If the epdf page markup contains an explicit
    ``/doi/pdf/<doi>?download=true`` href, that link (joined onto BASE) is
    returned; otherwise the same URL is constructed from BASE and *doi*.
    """
    link_pattern = r'href=["\'](/doi/pdf/%s\?download=true)["\']' % re.escape(doi)
    found = re.search(link_pattern, epdf_html)
    if found is None:
        return f"{BASE}/doi/pdf/{doi}?download=true"
    return urljoin(BASE, found.group(1))

def fetch_issue_soup_with_fallback(s: requests.Session, issue_url: str) -> BeautifulSoup:
    """Warm up the session, then fetch and parse the issue page.

    The warm-up applies PRIMARY_HEADERS to the session and visits the site
    root and the ``/loi/vjcs`` listing so cookies are set before the real
    request. It is best-effort: a failure is reported but never aborts.

    The local-HTML fallback itself is NOT performed here; on HTTP 403 this
    function only prints a hint and re-raises so the caller can offer it.

    Raises:
        requests.HTTPError: If the issue page request ends in an error status.
    """
    # Warm-up
    try:
        s.headers.update(PRIMARY_HEADERS)
        for warm_url in (BASE, f"{BASE}/loi/vjcs"):
            s.get(warm_url, timeout=TIMEOUT)
            time.sleep(0.3)
    except Exception as exc:
        # Deliberately broad: the session may be a cloudscraper one raising its
        # own exception types. Report instead of hiding the failure, because it
        # often explains a subsequent 403 or connection error.
        print(f"[WARN] Warm-up request failed ({exc}); continuing anyway.")

    # Try main fetch with strong Referer
    try:
        return soup_for_url(s, issue_url, referer=f"{BASE}/loi/vjcs")
    except requests.HTTPError as e:
        if e.response is not None and e.response.status_code == 403:
            print("[WARN] 403 on issue page. If it persists, you can paste a saved HTML file path.")
        raise

def main():
    """Interactive entry point: download every PDF of one issue and log the results.

    Flow:
      1. Ask for the issue URL and fetch the issue page. On HTTP 403 the user
         may supply a locally saved copy of the page instead.
      2. Derive the output folder ``VJCS_Vol{vol}_Issue{iss}_{yr}`` from the
         page and create it.
      3. For each article: open its epdf page, resolve the PDF URL, download
         it, save it under a sanitized, unique file name and append a row to
         the CSV log (also on failure).

    HTTP errors other than 403 while fetching the issue page propagate.
    """
    issue_url = input("Paste VJCS issue URL (e.g., https://www.worldscientific.com/toc/vjcs/07/01): ").strip()
    if not issue_url:
        print("No URL provided. Exiting.")
        return

    s = new_session()
    print(f"[INFO] Using {'cloudscraper' if USING_CLOUDSCRAPER else 'requests'} session")
    print("[INFO] Fetching issue page…")

    try:
        issue_soup = fetch_issue_soup_with_fallback(s, issue_url)
    except requests.HTTPError as e:
        if e.response is None or e.response.status_code != 403:
            raise
        # Paths pasted from a file manager are often wrapped in quotes.
        local_path = input("Paste local HTML path for the issue page (or leave blank to abort): ").strip().strip("\"'")
        if not local_path:
            print("Aborted due to 403.")
            return
        try:
            html = Path(local_path).expanduser().read_text(encoding="utf-8", errors="ignore")
        except OSError as exc:
            print(f"[ERROR] Could not read local HTML file: {exc}")
            return
        issue_soup = BeautifulSoup(html, "html.parser")

    vol, iss, yr = parse_folder(issue_soup)
    outdir = Path(f"VJCS_Vol{vol}_Issue{iss}_{yr}")
    outdir.mkdir(parents=True, exist_ok=True)
    log_csv = outdir / f"VJCS_Vol{vol}_Issue{iss}_{yr}_log.csv"

    rows = collect_issue_articles(issue_soup)
    print(f"[INFO] Found {len(rows)} articles on this issue page")

    saved = 0
    used_names: set[str] = set()  # case-folded file names already written in this run
    with log_csv.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["Title", "DOI", "PDF URL", "Filename", "Status"])

        for i, it in enumerate(rows, 1):
            # An empty sanitized title would otherwise yield the file name ".pdf".
            title = sanitize_filename(it["title"]) or f"article_{i:02d}"
            doi_path = it["doi_path"]                  # /doi/10.1142/...
            # Pull the DOI itself out of the path so that variants such as
            # /doi/abs/10.1142/... or a trailing query string do not leak into it.
            doi_match = re.search(r"/doi/(?:[a-z]+/)?(10\.\d{4,9}/[^?#\s]+)", doi_path, re.I)
            doi = doi_match.group(1) if doi_match else doi_path.split("/doi/")[-1]
            epdf_url = urljoin(BASE, "/doi/epdf/" + doi)
            pdf_url = ""

            try:
                # Grab epdf page (referer: issue page)
                epdf_resp = get_with_anti403(s, epdf_url, referer=issue_url)
                pdf_url = epdf_to_pdf_url(epdf_resp.text, doi)

                # Download PDF (referer: epdf page)
                pdf_resp = get_with_anti403(s, pdf_url, referer=epdf_url)
                if not ensure_pdf(pdf_resp):
                    w.writerow([title, doi, pdf_url, "", "Skipped (not a PDF response)"])
                    print(f"[{i}] ❌ Not a PDF: {title}")
                    continue

                # Articles sharing a title (e.g. "Editorial") must not overwrite
                # each other; compare case-insensitively for such file systems.
                fname = f"{title}.pdf"
                suffix = 2
                while fname.casefold() in used_names:
                    fname = f"{title} ({suffix}).pdf"
                    suffix += 1
                used_names.add(fname.casefold())

                (outdir / fname).write_bytes(pdf_resp.content)
                w.writerow([title, doi, pdf_url, fname, "OK"])
                print(f"[{i}] ✅ Saved: {fname}")
                saved += 1
            except Exception as e:
                # Keep going with the next article; record whatever URL was resolved.
                w.writerow([title, doi, pdf_url, "", f"Error: {e}"])
                print(f"[{i}] ❌ Error: {e}")
            finally:
                # Pause after every attempt, not only successful ones, so a run
                # of failures does not turn into a burst of rapid requests.
                time.sleep(PAUSE)

    print(f"\nDone! {saved} PDFs saved in {outdir}")
    print(f"Log: {log_csv}")

# Run the interactive downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
